library(tidyverse)
## -- Attaching packages ------------------------------------------------------------------ tidyverse 1.2.1 --
## v ggplot2 3.1.0 v purrr 0.3.2
## v tibble 2.1.1 v dplyr 0.8.0.1
## v tidyr 0.8.3 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.4.0
## -- Conflicts --------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(DT)
library(htmltools)
Setup
# Report parameters supplied by the knitting environment via `params`.
options = params$options
targetVar = params$targetVar        # e.g. 'Throughput' or a time column — TODO confirm allowed values
# Axis-unit label: throughput is MB/s, anything else is treated as seconds.
# Scalar conditions use if/else rather than ifelse(): ifelse() is meant for
# vectors and can silently strip attributes on scalars.
targetUnits = if (targetVar == 'Throughput') 'MB/s' else 'secs'
showOutliers = params$showOutliers
# Quantile used to cap plot y-axes: 1 keeps every point (all outliers),
# 0.90 trims the extreme tail.
outliersQuantile = if (showOutliers) 1 else 0.90
# Boxplot outlier glyph: shape 19 when showing outliers, NA hides them.
outliersShape = if (showOutliers) 19 else NA
plotCols = 2  # facet columns used by the plots throughout the report
Reading Files
# Read one raw results CSV (files have no header row) and attach the
# canonical column names used throughout this report.
read.clean.files = function(filename){
  result = read.csv(filename, header = FALSE)
  colnames(result) = c("Language", "Randomize", "Dataset", "MachineID",
                       "RunID", "Type", "Operation", "TimeTaken")
  return(result)
}
# Collect every results CSV under ../Results/ and split by execution
# environment based on the path. The pattern escapes the dot ("\\.csv$");
# the previous ".csv$" treated the dot as a regex wildcard and could match
# names merely ending in "csv".
files = list.files(path = "../Results/", pattern = "\\.csv$",
                   recursive = TRUE, full.names = TRUE)
#files
databricks.files = files[grepl("Databricks", files)]
# NOTE: the double dot in `local.vm..files` is kept — later code refers to
# this exact name.
local.vm..files = files[grepl("Local_VM", files)]
cluster.files = files[grepl("Cluster", files)]
# Read each environment's files with the shared reader (which applies the
# canonical column names), tag rows with their Setup, and stack them.
# Replaces three copies of the same read/rbind/tag sequence that bypassed
# read.clean.files() and re-read with bare read.csv().
read.setup.files = function(setup.files, setup.name){
  frames = lapply(setup.files, read.clean.files)  # list of named data.frames
  combined = do.call(rbind, frames)               # stack into one frame
  combined$Setup = setup.name
  return(combined)
}
merged.databricks = read.setup.files(databricks.files, 'Databricks')
merged.local.vm = read.setup.files(local.vm..files, 'Local VM')
merged.cluster = read.setup.files(cluster.files, 'Cluster')
merged_data = rbind(merged.databricks, merged.local.vm, merged.cluster)
merged_data$Setup = as.factor(merged_data$Setup)
# Reassert the full set of names (and keep the global `column.names`
# binding that existed here before, in case later chunks use it).
column.names = c("Language","Randomize","Dataset","MachineID","RunID","Type","Operation","TimeTaken","Setup")
colnames(merged_data) = column.names
# ---- Normalize the merged results ----
# Harmonize plural/singular Type labels ("...Operations" -> "...Operation").
merged_data$Type = as.factor(gsub(pattern = "Operations", replacement = "Operation", x = merged_data$Type))
# Convert columns to factors
merged_data$MachineID = as.factor(merged_data$MachineID)
merged_data$Randomize = as.factor(merged_data$Randomize)
merged_data$RunID = as.factor(merged_data$RunID)
# Reduce "dataset_100MB" -> "100", then factor-ize.
merged_data$Dataset = sub("dataset_", "", merged_data$Dataset)
merged_data$Dataset = sub("MB$", "", merged_data$Dataset)
merged_data$Dataset = as.factor(merged_data$Dataset)
merged_data$Operation = trimws(as.character(merged_data$Operation),'both')
# Fix a misspelled operation label. Vector indexing (df$col[cond] <- v) is
# used instead of df[cond, ]$col <- v because the latter raises an error
# when the condition matches zero rows.
merged_data$Operation[merged_data$Operation == 'Mathematical Operations on Columns'] = 'Mathematical Operation on Columns'
# Derive a coarse category (subType) from the operation-name prefix.
merged_data$subType = ''
merged_data$subType[grepl('^Sorting', merged_data$Operation)] = 'Sorting'
merged_data$subType[grepl('^Ranking', merged_data$Operation)] = 'Ranking'
merged_data$subType[grepl('^Split', merged_data$Operation)] = 'Splitting'
merged_data$subType[grepl('^GroupBy', merged_data$Operation)] = 'Grouping'
merged_data$subType[grepl(' Join ', merged_data$Operation)] = 'Joining'
merged_data$subType[grepl('^Merge', merged_data$Operation)] = 'Merging'
merged_data$subType[grepl('^Filter', merged_data$Operation)] = 'Filtering'
merged_data$subType[grepl('^Mathematical', merged_data$Operation)] = 'Mathematics'
merged_data$subType[grepl('^Pivot', merged_data$Operation)] = 'Pivots'
merged_data$subType[grepl('^Running|^Shift', merged_data$Operation)] = 'Run/Shift'
merged_data$subType[grepl('^Writing', merged_data$Operation)] = 'Writing'
# Actual on-disk sizes (in MB) of each generated dataset, measured once via
# file.size()/(1024*1024) and hard-coded so the report does not need the
# raw data files. The individual size_* variables are kept as globals.
size_10MB = 11.4789848327637 # file.size("../../Data/Databricks/machine2/dataset_10MB.csv")/(1024*1024)
size_100MB = 115.640992164612 # file.size("../../Data/Databricks/machine2/dataset_100MB.csv")/(1024*1024)
size_200MB = 229.8573
size_300MB = 343.2709
size_500MB = 576.678165435791 # file.size("../../Data/Databricks/machine2/dataset_500MB.csv")/(1024*1024)
actual.sizes = c('10' = size_10MB, '100' = size_100MB, '200' = size_200MB,
                 '300' = size_300MB, '500' = size_500MB)
# One line of output per dataset, e.g.:
## [1] "Actual Size of 10MB file (in MB) 11.4789848327637"
## [1] "Actual Size of 100MB file (in MB) 115.640992164612"
## [1] "Actual Size of 200MB file (in MB) 229.8573"
## [1] "Actual Size of 300MB file (in MB) 343.2709"
## [1] "Actual Size of 500MB file (in MB) 576.678165435791"
for (label in names(actual.sizes)) {
  print(paste(paste0("Actual Size of ", label, "MB file (in MB)"), actual.sizes[[label]]))
}
# Lookup table used later to compute Throughput = Size / TimeTaken.
size_info = data.frame(Dataset = names(actual.sizes), Size = unname(actual.sizes))
str(size_info)
## 'data.frame': 5 obs. of 2 variables:
## $ Dataset: Factor w/ 5 levels "10","100","200",..: 1 2 3 4 5
## $ Size : num 11.5 115.6 229.9 343.3 576.7
# Attach each dataset's measured size (MB) and derive throughput (MB/s).
# Base merge() is kept deliberately: it sorts the result by the `by`
# column, and the downstream transcripts reflect that ordering.
merged_data = merged_data %>%
  merge(size_info, by = 'Dataset') %>%
  mutate(Throughput = Size / TimeTaken)
# Environment names, in order of first appearance in the merged data.
Setups = unique(as.character(merged_data$Setup))
# Keep an unfiltered copy (including run 1) for the runs-comparison section.
data_raw = merged_data
# Drop the first run — presumably a warm-up run; TODO confirm.
merged_data = merged_data %>% filter(RunID != 1)
# Structure after cleaning and warm-up-run removal (transcript below).
str(merged_data)
## 'data.frame': 8357 obs. of 12 variables:
## $ Dataset : Factor w/ 5 levels "10","100","200",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Language : Factor w/ 2 levels "PySpark","Scala": 1 1 1 1 1 1 1 1 1 1 ...
## $ Randomize : Factor w/ 1 level "1": 1 1 1 1 1 1 1 1 1 1 ...
## $ MachineID : Factor w/ 2 levels "1","2": 1 1 1 1 1 1 1 1 1 1 ...
## $ RunID : Factor w/ 6 levels "1","2","3","4",..: 6 6 5 6 6 6 6 5 5 5 ...
## $ Type : Factor w/ 4 levels "Aggregate Operation",..: 2 3 4 4 2 2 2 2 2 3 ...
## $ Operation : chr "Merge 5 columns into 1" "Pivot 10 Rows and 1 Column" "Writing 10000 new rows" "Writing 10000 new rows" ...
## $ TimeTaken : num 2.33 12.4 3.73 3.01 4.82 ...
## $ Setup : Factor w/ 3 levels "Cluster","Databricks",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ subType : chr "Merging" "Pivots" "Writing" "Writing" ...
## $ Size : num 11.5 11.5 11.5 11.5 11.5 ...
## $ Throughput: num 4.924 0.926 3.076 3.811 2.379 ...
# Peek at the first rows of the cleaned, merged data.
head(merged_data)
## Dataset Language Randomize MachineID RunID Type
## 1 10 PySpark 1 1 6 Column Operation
## 2 10 PySpark 1 1 6 Mixed Operation
## 3 10 PySpark 1 1 5 Row Operation
## 4 10 PySpark 1 1 6 Row Operation
## 5 10 PySpark 1 1 6 Column Operation
## 6 10 PySpark 1 1 6 Column Operation
## Operation TimeTaken Setup subType Size
## 1 Merge 5 columns into 1 2.331200 Databricks Merging 11.47898
## 2 Pivot 10 Rows and 1 Column 12.402202 Databricks Pivots 11.47898
## 3 Writing 10000 new rows 3.732033 Databricks Writing 11.47898
## 4 Writing 10000 new rows 3.011919 Databricks Writing 11.47898
## 5 Left Outer Join 5 Columns 4.824355 Databricks Joining 11.47898
## 6 Left Outer Join 3 Columns 10.807082 Databricks Joining 11.47898
## Throughput
## 1 4.9240672
## 2 0.9255602
## 3 3.0757992
## 4 3.8111861
## 5 2.3793822
## 6 1.0621724
# Column-wise summary; note the per-Dataset and per-Setup row counts.
summary(merged_data)
## Dataset Language Randomize MachineID RunID
## 10 :2100 PySpark:4697 1:8357 1:5357 1: 0
## 100:1950 Scala :3660 2:3000 2:1890
## 200:1485 3:1863
## 300:1472 4:1647
## 500:1350 5:1485
## 6:1472
## Type Operation TimeTaken
## Aggregate Operation:1118 Length:8357 Min. : 0.230
## Column Operation :4164 Class :character 1st Qu.: 3.208
## Mixed Operation : 839 Mode :character Median : 10.105
## Row Operation :2236 Mean : 25.053
## 3rd Qu.: 25.786
## Max. :631.088
## Setup subType Size Throughput
## Cluster :1500 Length:8357 Min. : 11.48 Min. : 0.273
## Databricks:3317 Class :character 1st Qu.: 11.48 1st Qu.: 5.535
## Local VM :3540 Mode :character Median :229.86 Median : 12.702
## Mean :224.33 Mean : 23.838
## 3rd Qu.:343.27 3rd Qu.: 24.951
## Max. :576.68 Max. :268.098
# TRUE when knitting outside RStudio — the same expression is the default
# for subchunkify()'s `enabled` argument.
print(!(.Platform$GUI == "RStudio"))
## [1] TRUE
Common Functions
# Delete figure files left over from previous subchunkify() runs so stale
# sub-chunk images do not accumulate in figure/.
# Returns TRUE when every matched file was removed (also TRUE when there
# were no matches, since all(logical(0)) is TRUE).
subchunkify_clear <- function() {
  # TRUE/FALSE instead of T (T is an ordinary, reassignable binding).
  leftovers = dir(path = 'figure/', pattern = 'sub_chunk_',
                  include.dirs = TRUE, full.names = TRUE)
  all(file.remove(leftovers))
}
# Render a ggplot as its own dynamically generated knitr sub-chunk so each
# plot can carry its own fig.height/fig.width. The plot is deparsed into a
# zero-argument function, wrapped in a ```{r} chunk string, knitted inline,
# and the resulting markdown is cat()'d into the document.
# When `enabled` is FALSE (interactive RStudio sessions) it simply plots.
# NOTE(review): the chunk label uses floor(runif(1) * 10000), so label
# collisions between sub-chunks are possible — verify this is acceptable.
subchunkify <- function(g, fig_height=7, fig_width=5, enabled = !(.Platform$GUI == "RStudio")) {
if (enabled==FALSE) return(plot(g))
# Deparse the plot into the source text of a function returning it.
g_deparsed <- paste0(deparse(
function() {g}
), collapse = '')
# Build the chunk text; backticks are split across string pieces so this
# code can itself live inside a knitr document without closing its chunk.
sub_chunk <- paste0("
`","``{r sub_chunk_", floor(runif(1) * 10000), ", fig.height=", fig_height, ", fig.width=", fig_width, ", echo=FALSE}",
"\n(",
g_deparsed
, ")()",
"\n`","``
")
cat(knitr::knit(text = knitr::knit_expand(text = sub_chunk), quiet = TRUE))
}
# Apply the report's shared look: fixed PySpark/Scala colors and fills,
# light theme, grey facet strips. Extra arguments in ... are forwarded to
# ggplot2::theme(); strip_angle rotates the y facet-strip text.
ggplot_colors = function(plot, strip_angle = 0, ...){
  line_palette = c("#ca0020", "#0571b0")
  fill_palette = c("#f4a582", "#92c5de")
  languages = c('PySpark', 'Scala')
  plot +
    scale_color_manual(values = line_palette, breaks = languages) +
    scale_fill_manual(values = fill_palette, breaks = languages) +
    theme_light() +
    ggplot2::theme(strip.text.y = element_text(angle = strip_angle),
                   strip.background = element_rect(fill = '#636363'),
                   ...)
}
Simple Linear Regression Model
Using a simple linear model we can identify the elements that contribute to the speed of the queries. We see Scala has a coefficient of about -7, meaning that it brings an overall benefit of roughly 7 seconds, keeping the other variables constant.
# Remove stale sub-chunk figures before this section renders new ones.
subchunkify_clear()
## [1] TRUE
# Model the target metric against the main design factors. reformulate()
# builds the same formula object as pasting and parsing the string
# '<targetVar> ~ Dataset + Language + Operation + Setup'.
form = reformulate(c('Dataset', 'Language', 'Operation', 'Setup'), response = targetVar)
model = lm(data=merged_data,formula=form)
summary(model)
##
## Call:
## lm(formula = form, data = merged_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -86.528 -8.215 -0.664 7.298 195.047
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 49.8253 1.4343 34.738
## Dataset100 16.6170 0.6705 24.783
## Dataset200 18.8642 0.7374 25.581
## Dataset300 17.0566 0.7393 23.070
## Dataset500 18.2446 0.7578 24.077
## LanguageScala 9.8842 0.4644 21.283
## OperationFilter Reg Ex 1 14.4826 1.7726 8.170
## OperationFilter Reg Ex 2 15.0320 1.7726 8.480
## OperationFull Outer Join 10 Columns -35.4569 2.3737 -14.938
## OperationFull Outer Join 3 Columns -30.6454 3.2451 -9.444
## OperationFull Outer Join 5 Columns -37.4714 2.3737 -15.786
## OperationGroupBy 1 column -38.2980 1.7726 -21.605
## OperationGroupBy 10 columns -43.4185 1.7710 -24.516
## OperationGroupBy 5 columns -41.4954 1.7726 -23.409
## OperationInner Join 10 Columns -33.1897 2.3737 -13.982
## OperationInner Join 3 Columns -29.5554 3.2451 -9.108
## OperationInner Join 5 Columns -34.4918 2.3737 -14.531
## OperationLeft Outer Join 10 Columns -33.6506 2.3737 -14.177
## OperationLeft Outer Join 3 Columns -29.5222 3.2451 -9.097
## OperationLeft Outer Join 5 Columns -36.0190 2.3737 -15.174
## OperationMathematical Operation on Columns 54.0242 1.7726 30.477
## OperationMerge 10 columns into 1 -27.4617 1.7726 -15.492
## OperationMerge 2 columns into 1 -25.5555 1.7710 -14.430
## OperationMerge 5 columns into 1 -26.4484 1.7710 -14.934
## OperationPivot 1 Rows and 1 Column -10.8216 1.7710 -6.110
## OperationPivot 10 Rows and 1 Column -32.0375 1.7710 -18.090
## OperationPivot 5 Rows and 1 Column -28.5859 1.7726 -16.126
## OperationRanking by Group -31.6886 1.7710 -17.893
## OperationRunning Sum 2.7282 1.7710 1.540
## OperationShift (Lag) 5.6180 1.7710 3.172
## OperationSorting Asc 1 column -37.3011 1.7710 -21.062
## OperationSorting Asc 10 column -37.8982 1.7710 -21.399
## OperationSorting Asc 5 column -37.4642 1.7726 -21.135
## OperationSorting Desc 1 column -37.2524 1.7726 -21.015
## OperationSorting Desc 10 column -37.8062 1.7726 -21.328
## OperationSorting Desc 5 column -37.4615 1.7726 -21.133
## OperationSplit 1 Column into 10 -32.1950 1.7710 -18.179
## OperationSplit 1 Column into 5 -29.7700 1.7710 -16.809
## OperationWriting 100 new rows -22.7522 1.7710 -12.847
## OperationWriting 1000 new rows -23.0391 1.7726 -12.997
## OperationWriting 10000 new rows -25.4278 1.7710 -14.357
## SetupDatabricks -22.4437 0.6524 -34.403
## SetupLocal VM -28.6507 0.6470 -44.284
## Pr(>|t|)
## (Intercept) < 2e-16 ***
## Dataset100 < 2e-16 ***
## Dataset200 < 2e-16 ***
## Dataset300 < 2e-16 ***
## Dataset500 < 2e-16 ***
## LanguageScala < 2e-16 ***
## OperationFilter Reg Ex 1 3.53e-16 ***
## OperationFilter Reg Ex 2 < 2e-16 ***
## OperationFull Outer Join 10 Columns < 2e-16 ***
## OperationFull Outer Join 3 Columns < 2e-16 ***
## OperationFull Outer Join 5 Columns < 2e-16 ***
## OperationGroupBy 1 column < 2e-16 ***
## OperationGroupBy 10 columns < 2e-16 ***
## OperationGroupBy 5 columns < 2e-16 ***
## OperationInner Join 10 Columns < 2e-16 ***
## OperationInner Join 3 Columns < 2e-16 ***
## OperationInner Join 5 Columns < 2e-16 ***
## OperationLeft Outer Join 10 Columns < 2e-16 ***
## OperationLeft Outer Join 3 Columns < 2e-16 ***
## OperationLeft Outer Join 5 Columns < 2e-16 ***
## OperationMathematical Operation on Columns < 2e-16 ***
## OperationMerge 10 columns into 1 < 2e-16 ***
## OperationMerge 2 columns into 1 < 2e-16 ***
## OperationMerge 5 columns into 1 < 2e-16 ***
## OperationPivot 1 Rows and 1 Column 1.04e-09 ***
## OperationPivot 10 Rows and 1 Column < 2e-16 ***
## OperationPivot 5 Rows and 1 Column < 2e-16 ***
## OperationRanking by Group < 2e-16 ***
## OperationRunning Sum 0.12349
## OperationShift (Lag) 0.00152 **
## OperationSorting Asc 1 column < 2e-16 ***
## OperationSorting Asc 10 column < 2e-16 ***
## OperationSorting Asc 5 column < 2e-16 ***
## OperationSorting Desc 1 column < 2e-16 ***
## OperationSorting Desc 10 column < 2e-16 ***
## OperationSorting Desc 5 column < 2e-16 ***
## OperationSplit 1 Column into 10 < 2e-16 ***
## OperationSplit 1 Column into 5 < 2e-16 ***
## OperationWriting 100 new rows < 2e-16 ***
## OperationWriting 1000 new rows < 2e-16 ***
## OperationWriting 10000 new rows < 2e-16 ***
## SetupDatabricks < 2e-16 ***
## SetupLocal VM < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 20.94 on 8314 degrees of freedom
## Multiple R-squared: 0.6093, Adjusted R-squared: 0.6073
## F-statistic: 308.7 on 42 and 8314 DF, p-value: < 2.2e-16
Comparing of Environments
Comparing Local VM vs. Cloud, similar in configuration, controlling for the type of operation. The cloud machine is optimized for Scala and Spark, while the local machine has a standard installation of the software.
We notice the Databricks environment benefits the aggregate operations, and that Databricks benefits the Scala environment for the Row Operations. Otherwise, the other situations show a similar performance across languages and environments.
# Boxplots of the target metric per environment, split by language and
# faceted by operation type; the y axis is capped at outliersQuantile.
filtered = merged_data
y_cap = quantile(filtered[[targetVar]], probs = outliersQuantile)
p = ggplot(data = filtered,
           aes_string(x = 'Setup', y = targetVar, color = 'Language', fill = 'Language'))
p = p + geom_boxplot(outlier.size = 0.1, size = 0.1, outlier.shape = outliersShape)
p = p + scale_y_continuous(limits = c(0, y_cap),
                           labels = scales::comma_format(accuracy = 1, suffix = targetUnits))
p = p + facet_wrap(~Type, scales = 'free', ncol = plotCols)
ggplot_colors(p, axis.text.y = element_text(size = 7), legend.position = 'right')
## Warning: Removed 836 rows containing non-finite values (stat_boxplot).

Comparing different data sizes
Comparing the performance of the different queries among different sizes of dataset. We notice that the increased size has an important impact on the time. PySpark suffers more from the increased size, mainly for Row and Column Operations, while the Scala language shows a better performance. We can see this in the plots below.
# Boxplot of the target metric vs. dataset size, split by language and
# faceted by operation type. `title` becomes the plot title; the y axis is
# capped at the global outliersQuantile.
plotDataSize = function(filtered, title){
  y_cap = quantile(filtered[[targetVar]], probs = outliersQuantile)
  base_plot = ggplot(data = filtered,
                     aes_string(x = 'Dataset', y = targetVar,
                                color = 'Language', fill = 'Language'))
  styled = base_plot +
    geom_boxplot(outlier.size = 0.1, size = 0.1, outlier.shape = outliersShape) +
    scale_y_continuous(limits = c(0, y_cap),
                       labels = scales::comma_format(accuracy = 1, suffix = targetUnits)) +
    facet_wrap(~Type, labeller = label_wrap_gen(width = 100), scales = 'free_x', ncol = plotCols) +
    ggplot2::ggtitle(title)
  ggplot_colors(styled, strip_angle = 0,
                axis.text.y = element_text(size = 7),
                axis.text.x = element_text(size = 9))
}
All Setups
# Data-size comparison across all environments combined.
filtered= merged_data
plotDataSize(filtered,'Comparing Datsets - All Environments') # sic: 'Datsets' typo appears in the rendered title
## Warning: Removed 836 rows containing non-finite values (stat_boxplot).

By Setup
# One data-size comparison plot per environment; the environment name is
# also emitted via message() so it shows in the knit log.
for (current_setup in Setups) {
  message(current_setup)
  setup_rows = merged_data %>% filter(Setup == current_setup)
  plot(plotDataSize(setup_rows, paste0('Comparing Datsets - ', current_setup)))
}
## Databricks
## Warning: Removed 332 rows containing non-finite values (stat_boxplot).
## Local VM

## Warning: Removed 354 rows containing non-finite values (stat_boxplot).
## Cluster

## Warning: Removed 150 rows containing non-finite values (stat_boxplot).

Comparing Query Types
As expected, the slowest queries are the Aggregate ones, while the fastest ones are the Row Operations. Scala outperforms PySpark on the row operations, while we don’t see a significant difference on the other query types.
# Order query types by their median target value (ascending), then draw
# flipped boxplots by type, split by language, one facet per environment.
filtered = merged_data
# FALSE rather than F: T/F are ordinary bindings and can be reassigned.
filtered$Type = forcats::fct_reorder(filtered$Type, filtered[[targetVar]],
                                     .fun = median, .desc = FALSE)
p = ggplot(data = filtered,
           aes_string(x = 'Type', y = targetVar, color = 'Language', fill = 'Language')) +
  geom_boxplot(outlier.size = 0.1, size = 0.1, outlier.shape = outliersShape) +
  scale_y_continuous(limits = c(0, quantile(filtered[[targetVar]], probs = outliersQuantile)),
                     labels = scales::comma_format(accuracy = 1, suffix = targetUnits)) +
  facet_wrap(~Setup, ncol = 3) +
  ggplot2::coord_flip()
ggplot_colors(p, axis.text.x = element_text(size = 8),
              panel.spacing.x = unit(10, 'points'),
              legend.position = 'bottom')
## Warning: Removed 836 rows containing non-finite values (stat_boxplot).

Comparing Queries
#functions
# Keep only operations where a two-sided Wilcoxon rank-sum test finds a
# significant PySpark-vs-Scala difference in the target variable.
# Returns the matching rows, or NULL (after a message) when none pass.
# NOTE(review): the 0.025 cutoff is applied per operation with no
# multiple-testing correction — presumably intentional; confirm.
plot_get_significant = function(filtered){
  # p-value for one operation; `filtered` is the data argument and the
  # test formula is built from the global targetVar.
  w <- function(oper){wilcox.test(as.formula(paste0(targetVar,' ~ Language ')),alternative='two.sided'
                                  ,filtered,subset = Operation == oper)$p.value}
  ops = unique(filtered$Operation)
  # vapply instead of sapply: guarantees a numeric vector regardless of
  # input shape (sapply's return type varies with its input).
  test = vapply(ops, w, numeric(1))
  keep = ops[which(test < 0.025)]
  data2 = filtered[filtered$Operation %in% keep, ]
  if (nrow(data2) == 0) { message("No Significant Elements"); return(NULL) }
  return(data2)
}
# Scatter + median-line plot of the target metric vs. dataset size, colored
# by language and faceted by `facets` (default: one panel per Operation).
# `filtered` may be NULL (e.g. from plot_get_significant); a blank plot is
# returned so callers can pass the result to subchunkify unconditionally.
# NOTE: the file previously defined this function twice back to back; the
# first definition (identical but without `facets`) was dead code, shadowed
# immediately by this generalized one, so only the final definition is kept.
plot_operations_get = function(filtered, title = '', subtitle = '', ncol = plotCols, facets = 'Operation'){
  if (is.null(filtered)) return(ggplot(data = filtered) + geom_blank())
  plotData = filtered %>% mutate(Operation = factor(Operation))
  p = ggplot(data = plotData, aes_string(x = 'Dataset', y = targetVar, color = 'Language')) +
    # fun.y is deprecated in newer ggplot2 but matches the version in use here
    ggplot2::stat_summary(aes(group = Language), fun.y = median, geom = 'line', size = .5) +
    geom_jitter(alpha = 0.5, size = .5, width = 0.1) +
    scale_y_continuous(labels = scales::comma_format(accuracy = 1, suffix = targetUnits)) +
    facet_wrap(facets, scales = 'free', ncol = ncol,
               labeller = label_wrap_gen(width = round(100 / ncol))) +
    ggplot2::ggtitle(title, subtitle = subtitle)
  ggplot_colors(p, legend.position = 'bottom',
                axis.text.x = element_text(size = 8, angle = 45, hjust = 1),
                axis.text.y = element_text(size = 8))
}
# For each operation sub-type, render two sub-chunked plots: one with all
# operations, then one restricted to significantly different operations
# (skipped when plot_get_significant returns NULL).
plot_operations = function(filtered, title, ncol = plotCols){
  for (st in sort(unique(filtered$subType))){
    data = filter(filtered, subType == st)
    p = plot_operations_get(data, title = paste0(title, ' - ', st),
                            subtitle = ' (All Operations) ', ncol = ncol)
    # fig_width uses ncol (not the global plotCols): the original first call
    # used plotCols*3 while the second used ncol*3, so a caller-supplied
    # ncol produced inconsistently sized figures.
    subchunkify(p, fig_height = ceiling(n_distinct(data$Operation) / ncol) * 2.5 + .5,
                fig_width = ncol * 3)
    data = plot_get_significant(filter(filtered, subType == st))
    if (!is.null(data)) {
      p = plot_operations_get(data, title = paste0(title, ' - ', st),
                              subtitle = ' (Significant Operations) ', ncol = ncol)
      subchunkify(p, fig_height = ceiling(n_distinct(data$Operation) / ncol) * 2.5 + .5,
                  fig_width = ncol * 3)
    }
  }
}
# Render one environment-faceted plot per operation sub-type, each in its
# own sub-chunk sized by the number of environments present.
plot_environments = function(filtered, title, ncol = plotCols){
  for (st in sort(unique(filtered$subType))){
    subset_rows = filter(filtered, subType == st)
    env_plot = plot_operations_get(subset_rows,
                                   title = paste0(title, ' - ', st),
                                   subtitle = ' (All Operations) ',
                                   ncol = ncol, facets = 'Setup')
    subchunkify(env_plot,
                fig_height = ceiling(n_distinct(subset_rows$Setup) / ncol) * 2.5 + .5,
                fig_width = ncol * 3)
  }
}
# Per-operation mean of the target variable for each language, spread to
# one column per language, plus a Scala/PySpark ratio, rendered as a
# markdown table via knitr::kable.
# NOTE(review): uses superseded dplyr/tidyr verbs (summarise_at, spread)
# that happen to accept the string stored in `targetVar`; modern across()/
# pivot_wider() have different NSE semantics, so this is left unchanged.
# NOTE: the headers hard-code MB/sec, which presumes targetVar is
# 'Throughput' — TODO confirm behavior for time-based targets.
getTable = function(data){
t=data %>% group_by(subType,Operation,Language) %>%
summarise_at(.vars=targetVar,.funs=vars(mean)) %>%
ungroup() %>%
spread(key=Language,value=targetVar) %>%
mutate(delta = Scala/PySpark)
knitr::kable(t,digits=2,col.names=c('Category','Operation','PySpark (MB/sec)','Scala (MB/sec)','Scala/PySpark'),padding=0,format='markdown')
}
Comparing Row operations
Rows Operations - All Environments:
| Filtering |
Filter |
42.36 |
52.77 |
1.25 |
| Filtering |
Filter Reg Ex 1 |
58.61 |
65.07 |
1.11 |
| Filtering |
Filter Reg Ex 2 |
58.95 |
65.89 |
1.12 |
| Run/Shift |
Running Sum |
12.17 |
96.33 |
7.91 |
| Run/Shift |
Shift (Lag) |
12.10 |
102.90 |
8.51 |
| Writing |
Writing 100 new rows |
18.00 |
32.03 |
1.78 |
| Writing |
Writing 1000 new rows |
18.05 |
31.30 |
1.73 |
| Writing |
Writing 10000 new rows |
15.39 |
29.28 |
1.90 |








Rows Operations - Setup: Databricks
| Filtering |
Filter |
45.42 |
55.07 |
1.21 |
| Filtering |
Filter Reg Ex 1 |
63.60 |
66.61 |
1.05 |
| Filtering |
Filter Reg Ex 2 |
63.54 |
66.08 |
1.04 |
| Run/Shift |
Running Sum |
11.04 |
89.81 |
8.14 |
| Run/Shift |
Shift (Lag) |
10.40 |
93.57 |
9.00 |
| Writing |
Writing 100 new rows |
17.00 |
27.27 |
1.60 |
| Writing |
Writing 1000 new rows |
16.97 |
26.63 |
1.57 |
| Writing |
Writing 10000 new rows |
14.43 |
22.66 |
1.57 |





Rows Operations - Setup: Local VM
| Filtering |
Filter |
26.49 |
29.93 |
1.13 |
| Filtering |
Filter Reg Ex 1 |
32.80 |
36.44 |
1.11 |
| Filtering |
Filter Reg Ex 2 |
33.27 |
38.13 |
1.15 |
| Run/Shift |
Running Sum |
10.07 |
69.53 |
6.90 |
| Run/Shift |
Shift (Lag) |
10.20 |
78.50 |
7.70 |
| Writing |
Writing 100 new rows |
13.48 |
22.56 |
1.67 |
| Writing |
Writing 1000 new rows |
14.03 |
22.03 |
1.57 |
| Writing |
Writing 10000 new rows |
11.51 |
22.05 |
1.92 |

## No Significant Elements



Rows Operations - Setup: Cluster
| Filtering |
Filter |
79.63 |
93.82 |
1.18 |
| Filtering |
Filter Reg Ex 1 |
119.14 |
119.26 |
1.00 |
| Filtering |
Filter Reg Ex 2 |
119.99 |
121.04 |
1.01 |
| Run/Shift |
Running Sum |
20.79 |
162.98 |
7.84 |
| Run/Shift |
Shift (Lag) |
21.49 |
170.36 |
7.93 |
| Writing |
Writing 100 new rows |
33.09 |
60.47 |
1.83 |
| Writing |
Writing 1000 new rows |
31.84 |
59.19 |
1.86 |
| Writing |
Writing 10000 new rows |
28.56 |
56.97 |
1.99 |

## Warning in wilcox.test.default(x = c(34.3061034620522, 37.3931337469713, :
## cannot compute exact p-value with ties


## Warning in wilcox.test.default(x = c(14.4125942386937, 13.3451278278002, :
## cannot compute exact p-value with ties



Comparing Columns operations
Columns Operations - All Environments:
| Joining |
Full Outer Join 10 Columns |
4.82 |
7.10 |
1.47 |
| Joining |
Full Outer Join 5 Columns |
3.02 |
4.72 |
1.57 |
| Joining |
Inner Join 10 Columns |
6.98 |
9.56 |
1.37 |
| Joining |
Inner Join 5 Columns |
5.91 |
7.85 |
1.33 |
| Joining |
Left Outer Join 10 Columns |
6.49 |
9.16 |
1.41 |
| Joining |
Left Outer Join 5 Columns |
4.47 |
6.17 |
1.38 |
| Mathematics |
Mathematical Operation on Columns |
97.35 |
105.61 |
1.08 |
| Merging |
Merge 10 columns into 1 |
14.25 |
26.11 |
1.83 |
| Merging |
Merge 2 columns into 1 |
16.27 |
27.90 |
1.71 |
| Merging |
Merge 5 columns into 1 |
15.21 |
27.21 |
1.79 |
| Sorting |
Sorting Asc 1 column |
8.54 |
11.18 |
1.31 |
| Sorting |
Sorting Asc 10 column |
7.95 |
10.56 |
1.33 |
| Sorting |
Sorting Asc 5 column |
8.53 |
10.84 |
1.27 |
| Sorting |
Sorting Desc 1 column |
8.34 |
11.54 |
1.38 |
| Sorting |
Sorting Desc 10 column |
8.04 |
10.67 |
1.33 |
| Sorting |
Sorting Desc 5 column |
8.36 |
11.05 |
1.32 |
| Splitting |
Split 1 Column into 10 |
10.61 |
20.04 |
1.89 |
| Splitting |
Split 1 Column into 5 |
12.30 |
23.38 |
1.90 |



## No Significant Elements










Columns Operations - Setup:Databricks
| Joining |
Full Outer Join 10 Columns |
4.39 |
5.11 |
1.16 |
| Joining |
Full Outer Join 5 Columns |
1.86 |
3.29 |
1.77 |
| Joining |
Inner Join 10 Columns |
5.41 |
7.38 |
1.36 |
| Joining |
Inner Join 5 Columns |
4.68 |
6.22 |
1.33 |
| Joining |
Left Outer Join 10 Columns |
5.24 |
6.79 |
1.30 |
| Joining |
Left Outer Join 5 Columns |
2.40 |
4.17 |
1.74 |
| Mathematics |
Mathematical Operation on Columns |
99.81 |
98.28 |
0.98 |
| Merging |
Merge 10 columns into 1 |
14.20 |
21.86 |
1.54 |
| Merging |
Merge 2 columns into 1 |
15.49 |
23.58 |
1.52 |
| Merging |
Merge 5 columns into 1 |
14.42 |
22.84 |
1.58 |
| Sorting |
Sorting Asc 1 column |
7.07 |
7.20 |
1.02 |
| Sorting |
Sorting Asc 10 column |
6.27 |
7.12 |
1.14 |
| Sorting |
Sorting Asc 5 column |
6.94 |
7.44 |
1.07 |
| Sorting |
Sorting Desc 1 column |
6.94 |
7.54 |
1.09 |
| Sorting |
Sorting Desc 10 column |
6.30 |
6.95 |
1.10 |
| Sorting |
Sorting Desc 5 column |
6.44 |
6.83 |
1.06 |
| Splitting |
Split 1 Column into 10 |
12.56 |
19.05 |
1.52 |
| Splitting |
Split 1 Column into 5 |
13.63 |
21.08 |
1.55 |

## Warning in wilcox.test.default(x = c(2.37938222506204, 2.05855504313115, :
## cannot compute exact p-value with ties
## Warning in wilcox.test.default(x = c(1.74957457405481, 1.37888996360214, :
## cannot compute exact p-value with ties
## Warning in wilcox.test.default(x = c(1.14419792326157, 0.947824107373032, :
## cannot compute exact p-value with ties
## Warning in wilcox.test.default(x = c(2.85941124924865, 2.97915639123432, :
## cannot compute exact p-value with ties
## Warning in wilcox.test.default(x = c(3.01828145151815, 2.11499141749447, :
## cannot compute exact p-value with ties
## Warning in wilcox.test.default(x = c(1.32239706378069, 0.781816877438383, :
## cannot compute exact p-value with ties


## No Significant Elements



## No Significant Elements

Columns Operations - Setup:Local VM
| Joining |
Full Outer Join 10 Columns |
3.34 |
3.65 |
1.09 |
| Joining |
Full Outer Join 5 Columns |
2.87 |
3.48 |
1.21 |
| Joining |
Inner Join 10 Columns |
6.42 |
6.73 |
1.05 |
| Joining |
Inner Join 5 Columns |
5.34 |
5.15 |
0.96 |
| Joining |
Left Outer Join 10 Columns |
5.68 |
5.87 |
1.03 |
| Joining |
Left Outer Join 5 Columns |
5.10 |
5.52 |
1.08 |
| Mathematics |
Mathematical Operation on Columns |
65.14 |
78.17 |
1.20 |
| Merging |
Merge 10 columns into 1 |
10.46 |
20.13 |
1.92 |
| Merging |
Merge 2 columns into 1 |
12.02 |
19.84 |
1.65 |
| Merging |
Merge 5 columns into 1 |
11.53 |
20.09 |
1.74 |
| Sorting |
Sorting Asc 1 column |
7.46 |
10.26 |
1.37 |
| Sorting |
Sorting Asc 10 column |
7.07 |
9.48 |
1.34 |
| Sorting |
Sorting Asc 5 column |
7.49 |
9.51 |
1.27 |
| Sorting |
Sorting Desc 1 column |
7.18 |
10.70 |
1.49 |
| Sorting |
Sorting Desc 10 column |
7.06 |
9.61 |
1.36 |
| Sorting |
Sorting Desc 5 column |
7.70 |
10.67 |
1.39 |
| Splitting |
Split 1 Column into 10 |
4.00 |
10.00 |
2.50 |
| Splitting |
Split 1 Column into 5 |
5.73 |
15.00 |
2.62 |

## Warning in wilcox.test.default(x = c(1.23965666298723, 1.10711647223027, :
## cannot compute exact p-value with ties
## Warning in wilcox.test.default(x = c(6.52882705721958, 6.927836689191,
## 6.99238882889498, : cannot compute exact p-value with ties
## Warning in wilcox.test.default(x = c(4.28220661253785, 5.55924105941596, :
## cannot compute exact p-value with ties
## Warning in wilcox.test.default(x = c(4.68869908681748, 5.59325942002566, :
## cannot compute exact p-value with ties
## Warning in wilcox.test.default(x = c(1.16502398340096, 1.27786882785268, :
## cannot compute exact p-value with ties
## Warning in wilcox.test.default(x = c(4.8713464409426, 6.47614097686192, :
## cannot compute exact p-value with ties
## No Significant Elements

## No Significant Elements





Columns Operations - Setup:Cluster
| Joining |
Full Outer Join 10 Columns |
10.56 |
14.53 |
1.38 |
| Joining |
Full Outer Join 5 Columns |
6.93 |
8.84 |
1.28 |
| Joining |
Inner Join 10 Columns |
13.37 |
16.76 |
1.25 |
| Joining |
Inner Join 5 Columns |
11.29 |
13.80 |
1.22 |
| Joining |
Left Outer Join 10 Columns |
12.62 |
17.17 |
1.36 |
| Joining |
Left Outer Join 5 Columns |
8.82 |
10.82 |
1.23 |
| Mathematics |
Mathematical Operation on Columns |
181.72 |
175.16 |
0.96 |
| Merging |
Merge 10 columns into 1 |
24.96 |
46.60 |
1.87 |
| Merging |
Merge 2 columns into 1 |
30.05 |
52.64 |
1.75 |
| Merging |
Merge 5 columns into 1 |
27.42 |
50.20 |
1.83 |
| Sorting |
Sorting Asc 1 column |
15.06 |
20.96 |
1.39 |
| Sorting |
Sorting Asc 10 column |
14.48 |
19.61 |
1.35 |
| Sorting |
Sorting Asc 5 column |
15.19 |
20.28 |
1.34 |
| Sorting |
Sorting Desc 1 column |
14.90 |
21.21 |
1.42 |
| Sorting |
Sorting Desc 10 column |
14.89 |
20.24 |
1.36 |
| Sorting |
Sorting Desc 5 column |
14.74 |
20.26 |
1.37 |
| Splitting |
Split 1 Column into 10 |
24.46 |
42.10 |
1.72 |
| Splitting |
Split 1 Column into 5 |
27.51 |
44.74 |
1.63 |



## Warning in wilcox.test.default(x = c(43.9746927716948, 43.3457651552285, :
## cannot compute exact p-value with ties
## No Significant Elements






Comparing Aggregate operations
Scala is faster than PySpark
Aggregate Operations - All Environments:
| Grouping |
GroupBy 1 column |
7.99 |
9.63 |
1.21 |
| Grouping |
GroupBy 10 columns |
2.82 |
4.56 |
1.61 |
| Grouping |
GroupBy 5 columns |
4.74 |
6.50 |
1.37 |
| Ranking |
Ranking by Group |
12.25 |
19.15 |
1.56 |





Aggregate Operations - Setup: Databricks
| Grouping |
GroupBy 1 column |
9.05 |
9.41 |
1.04 |
| Grouping |
GroupBy 10 columns |
3.07 |
4.07 |
1.33 |
| Grouping |
GroupBy 5 columns |
5.09 |
6.09 |
1.20 |
| Ranking |
Ranking by Group |
12.12 |
16.61 |
1.37 |



Aggregate Operations - Setup: Local VM
| Grouping |
GroupBy 1 column |
2.59 |
4.78 |
1.85 |
| Grouping |
GroupBy 10 columns |
1.27 |
2.51 |
1.97 |
| Grouping |
GroupBy 5 columns |
1.81 |
3.13 |
1.73 |
| Ranking |
Ranking by Group |
7.48 |
11.78 |
1.57 |



Aggregate Operations - Setup: Cluster
| Grouping |
GroupBy 1 column |
20.61 |
19.79 |
0.96 |
| Grouping |
GroupBy 10 columns |
6.58 |
9.64 |
1.46 |
| Grouping |
GroupBy 5 columns |
12.12 |
14.05 |
1.16 |
| Ranking |
Ranking by Group |
25.88 |
38.97 |
1.51 |




Comparing Mixed operations
Scala is faster than PySpark
Mixed Operations - All Environments:
| Pivots |
Pivot 1 Rows and 1 Column |
34.94 |
37.75 |
1.08 |
| Pivots |
Pivot 10 Rows and 1 Column |
13.03 |
17.39 |
1.33 |
| Pivots |
Pivot 5 Rows and 1 Column |
16.80 |
20.46 |
1.22 |


Mixed Operations - Setup: Databricks
| Pivots |
Pivot 1 Rows and 1 Column |
31.24 |
32.67 |
1.05 |
| Pivots |
Pivot 10 Rows and 1 Column |
11.75 |
13.32 |
1.13 |
| Pivots |
Pivot 5 Rows and 1 Column |
14.29 |
15.18 |
1.06 |

## No Significant Elements
Mixed Operations - Setup: Local VM
| Pivots |
Pivot 1 Rows and 1 Column |
24.48 |
28.52 |
1.17 |
| Pivots |
Pivot 10 Rows and 1 Column |
7.72 |
11.61 |
1.50 |
| Pivots |
Pivot 5 Rows and 1 Column |
9.96 |
14.73 |
1.48 |

Mixed Operations - Setup: Cluster
| Pivots |
Pivot 1 Rows and 1 Column |
73.14 |
66.36 |
0.91 |
| Pivots |
Pivot 10 Rows and 1 Column |
31.02 |
37.09 |
1.20 |
| Pivots |
Pivot 5 Rows and 1 Column |
41.89 |
42.48 |
1.01 |

## No Significant Elements
Comparing Runs operations
Slight improvement with additional runs (not statistically significant).
# Run-by-run boxplots; data_raw keeps run 1, unlike the other sections.
filtered = data_raw
# `.desc` (with the dot) is the real fct_reorder argument name; the
# original `desc=F` was silently ignored — the knit log warned
# "Some components of ... were not used: desc". Since .desc = FALSE is
# also the default, the output is unchanged; the warning simply goes away.
# NOTE(review): the reordered factor is assigned to Operation but the plot
# puts RunID on x — possibly RunID was the intended target; left as-is to
# preserve the rendered output.
filtered$Operation = forcats::fct_reorder(filtered$RunID, filtered[[targetVar]], .fun = median, .desc = FALSE)
p = ggplot(data = filtered, aes_string(x = 'RunID', y = targetVar, fill = 'Language', color = 'Language')) +
  geom_boxplot(outlier.size = 0.5, size = 0.5, alpha = 0.6, outlier.shape = outliersShape) +
  scale_y_continuous(limits = c(0, quantile(filtered[[targetVar]], probs = outliersQuantile)),
                     labels = scales::comma_format(accuracy = 1, suffix = targetUnits))
ggplot_colors(p)
## Warning: Removed 1025 rows containing non-finite values (stat_boxplot).

By Setup
# Same run-by-run boxplot as above, but with one panel per environment.
p = ggplot(data = filtered,
           aes_string(x = 'RunID', y = targetVar, fill = 'Language', color = 'Language'))
p = p + geom_boxplot(outlier.size = 0.5, size = 0.5, alpha = 0.6, outlier.shape = outliersShape)
p = p + scale_y_continuous(limits = c(0, quantile(filtered[[targetVar]], probs = outliersQuantile)),
                           labels = scales::comma_format(accuracy = 1, suffix = targetUnits))
p = p + facet_wrap(~Setup, ncol = 1)
ggplot_colors(p)
## Warning: Removed 1025 rows containing non-finite values (stat_boxplot).
